pacman::p_load(
#data wrangling
tidyverse, stringr,
#data visualization
ggplot2, RColorBrewer, ggsci,
plotly, ggpubr, vtable
)
# Reading the CSV file
# Load the Forbes 2022 billionaires list and normalise the column names
# (the unnamed first column is surfaced as `x1`).
forbes <- janitor::clean_names(
  read_csv("2022_forbes_billionaires.csv")
)
## New names:
## Rows: 2600 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): name, networth, country, source, industry dbl (3): ...1, rank, age
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Print the tibble to inspect the first rows and the parsed column types.
forbes
## # A tibble: 2,600 × 8
## x1 rank name networth age country source indus…¹
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 0 1 Elon Musk $219 B 50 United St… Tesla… Automo…
## 2 1 2 Jeff Bezos $171 B 58 United St… Amazon Techno…
## 3 2 3 Bernard Arnault & family $158 B 73 France LVMH Fashio…
## 4 3 4 Bill Gates $129 B 66 United St… Micro… Techno…
## 5 4 5 Warren Buffett $118 B 91 United St… Berks… Financ…
## 6 5 6 Larry Page $111 B 49 United St… Google Techno…
## 7 6 7 Sergey Brin $107 B 48 United St… Google Techno…
## 8 7 8 Larry Ellison $106 B 77 United St… softw… Techno…
## 9 8 9 Steve Ballmer $91.4 B 66 United St… Micro… Techno…
## 10 9 10 Mukesh Ambani $90.7 B 64 India diver… Divers…
## # … with 2,590 more rows, and abbreviated variable name ¹industry
# Inspect the column names, types and leading values of the dataset
forbes |>
  glimpse()
## Rows: 2,600
## Columns: 8
## $ x1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ name <chr> "Elon Musk", "Jeff Bezos", "Bernard Arnault & family", "Bill …
## $ networth <chr> "$219 B", "$171 B", "$158 B", "$129 B", "$118 B", "$111 B", "…
## $ age <dbl> 50, 58, 73, 66, 91, 49, 48, 77, 66, 64, 59, 80, 82, 68, 37, 7…
## $ country <chr> "United States", "United States", "France", "United States", …
## $ source <chr> "Tesla, SpaceX", "Amazon", "LVMH", "Microsoft", "Berkshire Ha…
## $ industry <chr> "Automotive", "Technology", "Fashion & Retail", "Technology",…
# Rebuild the tibble with capitalised column names. Only the seven columns
# listed here are carried over, so the index column `x1` is dropped.
forbes <- tibble(
  Rank     = forbes[["rank"]],
  Names    = forbes[["name"]],
  Networth = forbes[["networth"]],
  Age      = forbes[["age"]],
  Country  = forbes[["country"]],
  Source   = forbes[["source"]],
  Industry = forbes[["industry"]]
)
forbes |>
  glimpse()
## Rows: 2,600
## Columns: 7
## $ Rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Names <chr> "Elon Musk", "Jeff Bezos", "Bernard Arnault & family", "Bill …
## $ Networth <chr> "$219 B", "$171 B", "$158 B", "$129 B", "$118 B", "$111 B", "…
## $ Age <dbl> 50, 58, 73, 66, 91, 49, 48, 77, 66, 64, 59, 80, 82, 68, 37, 7…
## $ Country <chr> "United States", "United States", "France", "United States", …
## $ Source <chr> "Tesla, SpaceX", "Amazon", "LVMH", "Microsoft", "Berkshire Ha…
## $ Industry <chr> "Automotive", "Technology", "Fashion & Retail", "Technology",…
# Check for duplicate rows: keep only the first occurrence of every row.
# If the result has as many rows as `forbes`, no record is duplicated.
duplicate <- forbes[!duplicated(forbes), ]
duplicate
## # A tibble: 2,600 × 7
## Rank Names Networth Age Country Source Indus…¹
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 Elon Musk $219 B 50 United States Tesla, S… Automo…
## 2 2 Jeff Bezos $171 B 58 United States Amazon Techno…
## 3 3 Bernard Arnault & family $158 B 73 France LVMH Fashio…
## 4 4 Bill Gates $129 B 66 United States Microsoft Techno…
## 5 5 Warren Buffett $118 B 91 United States Berkshir… Financ…
## 6 6 Larry Page $111 B 49 United States Google Techno…
## 7 7 Sergey Brin $107 B 48 United States Google Techno…
## 8 8 Larry Ellison $106 B 77 United States software Techno…
## 9 9 Steve Ballmer $91.4 B 66 United States Microsoft Techno…
## 10 10 Mukesh Ambani $90.7 B 64 India diversif… Divers…
## # … with 2,590 more rows, and abbreviated variable name ¹Industry
# Positions (linear indices into the logical NA matrix) of missing cells in
# the de-duplicated data
Null_values <- which(is.na(duplicate))
# Strip the "$" and "B" characters so that net worth can be stored as a
# number, e.g. "$219 B" -> 219
forbes[["Networth"]] <- as.numeric(
  str_remove_all(forbes[["Networth"]], "[$B]")
)
# Summarise every column to confirm the resulting data types
summary(forbes)
## Rank Names Networth Age
## Min. : 1 Length:2600 Min. : 1.000 Min. : 19.00
## 1st Qu.: 637 Class :character 1st Qu.: 1.500 1st Qu.: 55.00
## Median :1292 Mode :character Median : 2.400 Median : 64.00
## Mean :1270 Mean : 4.861 Mean : 64.27
## 3rd Qu.:1929 3rd Qu.: 4.500 3rd Qu.: 74.00
## Max. :2578 Max. :219.000 Max. :100.00
## Country Source Industry
## Length:2600 Length:2600 Length:2600
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Summary statistics (N, mean, SD, min, quartiles, max) for Age and Networth.
# out = "return" hands the summary table back as a data.frame; out = "csv"
# without a `file` argument returns the same table but raises a warning on
# every run.
statistics <- forbes[, c("Age", "Networth")] %>%
  st(out = "return") %>%
  as.data.frame()
## Warning in st(., out = "csv"): out = "csv" will just return the vtable as a
## data.frame unless combined with file
# Display the summary-statistics table for Age and Networth.
statistics
## Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
## 1 Age 2600 64.272 13.221 19 55 74 100
## 2 Networth 2600 4.861 10.66 1 1.5 4.5 219
# Bar chart of the first 10 rows of `forbes`: one bar per person, ordered by
# descending net worth and labelled with the value.
Top <- ggplot(
  data = head(forbes, 10),
  mapping = aes(x = reorder(Names, -Networth), y = Networth, fill = Names)
) +
  geom_col() +
  geom_text(mapping = aes(label = Networth), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their networth",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "Spectral") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Top
# Bubble chart of the first 10 rows: marker size follows net worth,
# marker colour follows country.
Countries <- forbes |>
  head(10) |>
  plot_ly(
    x = ~Names,
    y = ~Networth,
    type = "scatter",
    mode = "markers",
    marker = list(size = ~Networth),
    color = ~Country
  ) |>
  layout(title = "Countries of top 10 billionaires")
Countries
# Bubble chart of the first 10 rows: marker size follows net worth,
# marker colour follows industry.
Industries <- forbes |>
  head(10) |>
  plot_ly(
    x = ~Names,
    y = ~Networth,
    type = "scatter",
    mode = "markers",
    marker = list(size = ~Networth),
    color = ~Industry
  ) |>
  layout(title = "Industry of top 10 billionaires")
Industries
# Bubble chart of the first 10 rows: marker size follows net worth,
# marker colour follows the source of wealth.
Source <- forbes |>
  head(10) |>
  plot_ly(
    x = ~Names,
    y = ~Networth,
    type = "scatter",
    mode = "markers",
    marker = list(size = ~Networth),
    color = ~Source
  ) |>
  layout(title = "Sources of top 10 billionaires")
Source
# Ages of the first 10 rows of `forbes`, bars ordered from oldest to youngest
# and labelled with the age. The y axis shows Age, so it is labelled "Age"
# (it was mislabelled "Networth").
bar <- forbes %>%
  head(10) %>%
  ggplot(aes(x = reorder(Names, -Age), y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their age",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "BrBG") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
bar
# Sort by age, oldest first, and chart the 10 oldest people.
forbes_Old <- forbes[order(forbes$Age, decreasing = TRUE), ]
# The x axis shows Names, so it is labelled "Names" (it was mislabelled
# "Networth").
Forbes_Old2 <- forbes_Old %>%
  head(10) %>%
  ggplot(aes(x = Names, y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 oldest billionaires",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Forbes_Old2
# Sort by age, oldest first; the last 10 rows are then the youngest people.
forbes_Old <- forbes[order(forbes[["Age"]], decreasing = TRUE), ]
Tail_10 <- ggplot(
  data = tail(forbes_Old, 10),
  mapping = aes(x = Names, y = Age, fill = Names)
) +
  geom_col() +
  geom_text(mapping = aes(label = Age), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 youngest billionaires",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Tail_10
# Total net worth per country, largest first
worth <- forbes |>
  select(Country, Networth) |>
  group_by(Country) |>
  summarise(Top_billionaire = sum(Networth)) |>
  arrange(desc(Top_billionaire))
Top_countries <- head(worth, 10)
# A negative `warn` makes R ignore all warnings from here on (global setting)
options(warn = -1)
Top_countries1 <- Top_countries |>
  plot_ly(
    x = ~Country,
    y = ~Top_billionaire,
    type = "scatter",
    mode = "markers",
    size = 10,
    color = ~Country,
    colors = "Dark2"
  ) |>
  layout(
    title = "Total networth of billionaires in top 10 countries",
    yaxis = list(title = "Networth")
  )
Top_countries1
# Histogram of ages across the whole dataset, coloured by age value
Age <- forbes |>
  plot_ly(
    x = ~Age,
    type = "histogram",
    color = ~Age,
    colors = "Paired"
  ) |>
  layout(
    title = "Distribution of Age in billionaires",
    yaxis = list(title = "Count")
  )
Age
# Number of billionaires per country and industry, largest groups first.
# `arrange(desc = n)` only passed `n` under the argument name "desc", so the
# rows came out in ascending order; `desc(n)` is the descending helper.
# `.groups = "drop_last"` spells out summarise()'s default regrouping (the
# result stays grouped by Country) and silences its informational message.
Industries <- forbes %>%
  group_by(Country, Industry) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
# Show the per-country, per-industry billionaire counts.
Industries
## # A tibble: 448 × 3
## # Groups: Country [75]
## Country Industry n
## <chr> <chr> <int>
## 1 Algeria Food & Beverage 1
## 2 Argentina Diversified 1
## 3 Argentina Healthcare 1
## 4 Argentina Real Estate 1
## 5 Argentina Technology 1
## 6 Australia Automotive 1
## 7 Australia Diversified 1
## 8 Australia Gambling & Casinos 1
## 9 Australia Logistics 1
## 10 Australia Media & Entertainment 1
## # … with 438 more rows
# Industry counts restricted to the United States
United <- filter(Industries, Country == "United States")
# Palette generator interpolating blue -> yellow -> green -> red, and the
# 18 colours drawn from it for the industry charts.
fun_color_range <- grDevices::colorRampPalette(
  colors = c("Blue", "Yellow", "Green", "Red")
)
my_colors <- fun_color_range(n = 18)
my_colors
## [1] "#0000FF" "#2D2DD2" "#5A5AA4" "#878778" "#B4B44A" "#E1E11D" "#F0FF00"
## [8] "#C3FF00" "#95FF00" "#68FF00" "#3BFF00" "#0EFF00" "#1DE100" "#4AB400"
## [15] "#778700" "#A45A00" "#D22D00" "#FF0000"
# Bubble chart of the number of US billionaires (n) per industry; marker size
# follows the count and colour follows the industry.
United_States <- United %>%
  plot_ly(
    x = ~Industry, y = ~n, type = "scatter",
    mode = "markers", size = ~n, color = ~Industry, colors = my_colors
  ) %>%
  layout(
    title = "United States networth in all industries",
    yaxis = list(title = "United States")
  )
# Render the chart as well as the underlying table; printing only `United`
# left the plot object unused.
United_States
United
## # A tibble: 18 × 3
## # Groups: Country [1]
## Country Industry n
## <chr> <chr> <int>
## 1 United States Metals & Mining 2
## 2 United States Construction & Engineering 5
## 3 United States Telecom 5
## 4 United States Gambling & Casinos 6
## 5 United States Logistics 6
## 6 United States Diversified 15
## 7 United States Automotive 16
## 8 United States Service 18
## 9 United States Sports 24
## 10 United States Manufacturing 25
## 11 United States Energy 32
## 12 United States Healthcare 32
## 13 United States Media & Entertainment 41
## 14 United States Real Estate 46
## 15 United States Fashion & Retail 53
## 16 United States Food & Beverage 63
## 17 United States Technology 137
## 18 United States Finance & Investments 193
# Bubble chart of billionaire counts per industry for one country.
#
# country_counts: rows of `Industries` for a single country (needs the
#                 columns `Industry` and `n`).
# country:        country name, used in the plot title and as y-axis title.
# palette:        colours handed to plotly; defaults to the shared `my_colors`.
# Returns the plotly object.
# NOTE(review): the title says "networth" but the y value is the count `n`;
# the wording is kept unchanged here - confirm which one is intended.
plot_country_industries <- function(country_counts, country,
                                    palette = my_colors) {
  country_counts %>%
    plot_ly(
      x = ~Industry, y = ~n, type = "scatter",
      mode = "markers", size = ~n, color = ~Industry, colors = palette
    ) %>%
    layout(
      title = paste0(country, " networth in all industries"),
      yaxis = list(title = country)
    )
}

China <- Industries %>% filter(Country == "China")
China_Industries <- plot_country_industries(China, "China")
China_Industries

India <- Industries %>% filter(Country == "India")
India_Industries <- plot_country_industries(India, "India")
India_Industries

Germany <- Industries %>% filter(Country == "Germany")
Germany_Industries <- plot_country_industries(Germany, "Germany")
Germany_Industries

France <- Industries %>% filter(Country == "France")
France_Industries <- plot_country_industries(France, "France")
France_Industries
# Ages of the first 10 rows of `forbes`, bars ordered from oldest to youngest
# and labelled with the age. The y axis shows Age, so it is labelled "Age"
# (it was mislabelled "Networth").
bar <- forbes %>%
  head(10) %>%
  ggplot(aes(x = reorder(Names, -Age), y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their age",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "BrBG") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
bar
# Oldest billionaires: sort by age (oldest first), then chart the net worth
# of the first 10, coloured by source of wealth.
forbes_Old <- forbes[order(forbes[["Age"]], decreasing = TRUE), ]
Forbes_Old3 <- ggplot(
  data = head(forbes_Old, 10),
  mapping = aes(x = Names, y = Networth, fill = Source)
) +
  geom_col() +
  geom_text(mapping = aes(label = Networth), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 oldest billionaires and their sources",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Forbes_Old3
## Youngest billionaires and their sources: with the data sorted oldest
## first, the last 10 rows are the youngest people.
forbes_Old <- forbes[order(forbes[["Age"]], decreasing = TRUE), ]
Tail_10 <- ggplot(
  data = tail(forbes_Old, 10),
  mapping = aes(x = Names, y = Networth, fill = Source)
) +
  geom_col() +
  geom_text(mapping = aes(label = Networth), cex = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 youngest billionaires and their sources",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Tail_10
## Total net worth per source of wealth, largest first
Sources <- forbes |>
  group_by(Source) |>
  summarise(Total_Networth = sum(Networth)) |>
  arrange(desc(Total_Networth))
Sources
## # A tibble: 895 × 2
## Source Total_Networth
## <chr> <dbl>
## 1 real estate 574.
## 2 diversified 382
## 3 investments 358.
## 4 software 290.
## 5 pharmaceuticals 284.
## 6 hedge funds 272.
## 7 Google 261.
## 8 Walmart 238
## 9 Microsoft 232.
## 10 Tesla, SpaceX 219
## # … with 885 more rows
## Scatter plot of total net worth for the 10 largest sources
Top_Billionaire_sources <- Sources |>
  head(10) |>
  plot_ly(
    x = ~Source,
    y = ~Total_Networth,
    type = "scatter",
    mode = "markers",
    size = 10,
    color = ~Source,
    colors = my_colors
  ) |>
  layout(
    title = "Total networth of billionaires in top 10 sources",
    yaxis = list(title = "Networth")
  )
Top_Billionaire_sources
## Total net worth per industry, largest first
Industry <- forbes |>
  group_by(Industry) |>
  summarise(Total_Networth = sum(Networth)) |>
  arrange(desc(Total_Networth))
Industry
## # A tibble: 18 × 2
## Industry Total_Networth
## <chr> <dbl>
## 1 Technology 2168.
## 2 Finance & Investments 1734.
## 3 Fashion & Retail 1613.
## 4 Manufacturing 1080.
## 5 Diversified 940.
## 6 Food & Beverage 933.
## 7 Healthcare 709.
## 8 Real Estate 686.
## 9 Automotive 583.
## 10 Media & Entertainment 494.
## 11 Energy 395.
## 12 Metals & Mining 390.
## 13 Telecom 205.
## 14 Logistics 196.
## 15 Service 186.
## 16 Construction & Engineering 121.
## 17 Gambling & Casinos 108.
## 18 Sports 97.6
## Scatter plot of total net worth for the 10 largest industries
Top_Billionaire_Industry <- Industry |>
  head(10) |>
  plot_ly(
    x = ~Industry,
    y = ~Total_Networth,
    type = "scatter",
    mode = "markers",
    size = 10,
    color = ~Industry,
    colors = my_colors
  ) |>
  layout(
    title = "Total networth of billionaires in top 10 Industries",
    yaxis = list(title = "Networth")
  )
Top_Billionaire_Industry
# Session info: R version, platform and package versions used for this run
utils::sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] vtable_1.3.4 kableExtra_1.3.4 ggpubr_0.4.0 plotly_4.10.0
## [5] ggsci_2.9 RColorBrewer_1.1-3 forcats_0.5.2 stringr_1.4.1
## [9] dplyr_1.0.10 purrr_0.3.5 readr_2.1.3 tidyr_1.2.1
## [13] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 lubridate_1.8.0 bit64_4.0.5
## [4] insight_0.18.6 webshot_0.5.4 httr_1.4.4
## [7] tools_4.2.2 backports_1.4.1 bslib_0.4.0
## [10] sjlabelled_1.2.0 utf8_1.2.2 R6_2.5.1
## [13] DBI_1.1.3 lazyeval_0.2.2 colorspace_2.0-3
## [16] withr_2.5.0 tidyselect_1.2.0 bit_4.0.4
## [19] compiler_4.2.2 cli_3.4.1 rvest_1.0.3
## [22] pacman_0.5.1 xml2_1.3.3 labeling_0.4.2
## [25] sass_0.4.2 scales_1.2.1 systemfonts_1.0.4
## [28] digest_0.6.29 rmarkdown_2.17 svglite_2.1.0
## [31] pkgconfig_2.0.3 htmltools_0.5.3 highr_0.9
## [34] dbplyr_2.2.1 fastmap_1.1.0 htmlwidgets_1.5.4
## [37] rlang_1.0.6 readxl_1.4.1 rstudioapi_0.14
## [40] farver_2.1.1 jquerylib_0.1.4 generics_0.1.3
## [43] jsonlite_1.8.2 crosstalk_1.2.0 vroom_1.6.0
## [46] car_3.1-1 googlesheets4_1.0.1 magrittr_2.0.3
## [49] munsell_0.5.0 fansi_1.0.3 abind_1.4-5
## [52] lifecycle_1.0.3 stringi_1.7.8 yaml_2.3.5
## [55] snakecase_0.11.0 carData_3.0-5 grid_4.2.2
## [58] parallel_4.2.2 crayon_1.5.2 haven_2.5.1
## [61] hms_1.1.2 knitr_1.40 pillar_1.8.1
## [64] ggsignif_0.6.4 reprex_2.0.2 glue_1.6.2
## [67] evaluate_0.17 data.table_1.14.2 modelr_0.1.9
## [70] vctrs_0.4.2 tzdb_0.3.0 cellranger_1.1.0
## [73] gtable_0.3.1 assertthat_0.2.1 cachem_1.0.6
## [76] xfun_0.33 janitor_2.1.0 broom_1.0.1
## [79] rstatix_0.7.0 googledrive_2.0.0 viridisLite_0.4.1
## [82] gargle_1.2.1 ellipsis_0.3.2